// DDF: given the number of nodes and a vector of what the space is (i.e. the
// tag) it can generate which node holds the data ([0..N-1]) at runtime.
// 

#include "CNCTask.hpp"
#include "CNCDeps.hpp"
#include "../support/Threading.hpp"

#include "../common/definitions.h"

struct computePreds {
  // condsE must be a tuple holding <zero, false, ... false>
  template<int TASK_ID, int TASKTAGDIM, int INTERTASKDIM, typename... BOOLS, typename... DEPDISTS>
  static void compute(TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> const & tt,
                      std::tuple<BOOLS...> &condsE, std::tuple<DEPDISTS...> &dists) {
    // static_assert(tuple_size);
    static constexpr int NPARAMS = TASKTAGDIM - INTERTASKDIM;
    
    // Tuple holding <One, t1 ... t_INTERTASKDIM, p1, ... p_NPARAMS>
    static constexpr auto termsE = 
      std::tuple_cat(make_tuple(ConstantTerm()), 
                     makeCompositeTuple<VariableTerm, INTERTASKDIM+NPARAMS>::make());
    //
    /////////////////////////////////////////////////////////////////////////////////    
    // Still TBD how to compress this with templates
    //
    static constexpr auto t1 = std::get<1>(termsE);
    static constexpr auto t2 = std::get<2>(termsE);
    static constexpr auto t3 = std::get<3>(termsE);
    static constexpr auto pT = std::get<1+INTERTASKDIM>(termsE);
    // static constexpr auto pM = std::get<2+INTERTASKDIM>(termsE);
    static constexpr auto pN = std::get<3+INTERTASKDIM>(termsE);
    // static constexpr auto pP = std::get<4+INTERTASKDIM>(termsE);
    
    static constexpr auto p1 = 
      FOR(t1, CEIL(-pN-15,16), FLOOR(pT-3,16));
    static constexpr auto p2 = 
      FOR(t2, MAX(t1,-t1-1), 
          MIN(MIN(FLOOR(-8*t1+pT-2,8),FLOOR(8*t1+pN+7,8)),FLOOR(pT+pN-2,16)));
    static constexpr auto p3 = 
      FOR (t3, MAX(MAX(0,CEIL(t1+t2-1,2)),CEIL(16*t2-pN-14,16)), 
           MIN(MIN(FLOOR(pT+pN-2,16),FLOOR(16*t2+pN+14,16)),FLOOR(8*t1+8*t2+pN+15,16)));

    static constexpr auto lbsE = 
      std::make_tuple(ConstantTerm(), 
                      (t1 >= p1.first), (t2 >= p2.first), (t3 >= p3.first));
    static constexpr auto ubsE = 
      std::make_tuple(ConstantTerm(), 
                      (t1 <= p1.second), (t2 <= p2.second), t3 <= (p3.second));

    //
    // END Still TBD how to compress this with templates
    /////////////////////////////////////////////////////////////////////////////////  
    //

    // tuple holding <1, tt[0], ... tt[INTERTASKDIM+NPARAMS-1]>
    auto tagAsTupleE = std::tuple_cat(make_tuple(1), tt.toTuple());

    computePredsHelper<TASKTAGDIM, INTERTASKDIM, 1, INTERTASKDIM>::compute
      (condsE, dists, lbsE, ubsE, tagAsTupleE);
  }
};


template<typename TUNER>
void initializeTunerOnce(TUNER &) {
}

template<typename STEP>
void initializeStepOnce(STEP &) {
  // stringstream ss;
  // ss << t;
  // Threading::dumpThreadInfo(std::cerr, ss.str());
  Threading::pinThread();
}

template<int TASK_ID, int TASKTAGDIM, int INTERTASKDIM>
template<class DC>
void CNCTask_tuner<TASK_ID, TASKTAGDIM, INTERTASKDIM>::depends
(const TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> & t, CNCTask_context & c, DC & dc ) const {  
  TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> tt(t);

  if (!initialized.local()) {
    initializeTunerOnce(*this);
    initialized.local() = true;    
  }
  
#ifdef MEASURE_OVERHEADS
  Timer oh(GlobalTimers::Singleton().getOverheadGlobal());
#endif

  {
    CNCRuntimeWrapper<CNCTask_context, decltype(c.getItemCollection(tt)), DC> 
      W(c, c.getItemCollection(tt), dc);

    TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> tts(t);
    tag_elem_t d1 = 1, d2 = 1, d3 = 1;
    auto dists = std::make_tuple(ConstantTerm(), d1, d2, d3);
    auto condsE = makeIdenticalTuple<bool, INTERTASKDIM+1>::make(false);
    computePreds::compute<TASK_ID, TASKTAGDIM, INTERTASKDIM> 
      (tts, condsE, dists);
    CNCDeps<TASK_ID, TASKTAGDIM, INTERTASKDIM>::genDeps(W, tts, condsE, dists);

    if (std::get<1>(condsE) && std::get<2>(condsE) && std::get<3>(condsE)) {
      CNCDeps<TASK_ID, TASKTAGDIM, INTERTASKDIM>::genZeroDep(W, tts, condsE);
    }
  }

  return;
}

template <int TASK_ID, int TASKTAGDIM, int INTERTASKDIM>
int64_t CNCTask_step<TASK_ID, TASKTAGDIM, INTERTASKDIM>::execute
(const TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> &t, CNCTask_context & c) const {
  if (!initialized) {
    initializeStepOnce(*this);
    initialized = true;    
  }

  { // Auto-generated polyhedral loops
    int64_t t1 = t[0];
    int64_t t2 = t[1];
    int64_t t3 = t[2];
    int64_t pT = t[0+INTERTASKDIM];
    int64_t pM = t[1+INTERTASKDIM];
    int64_t pN = t[2+INTERTASKDIM];
    int64_t pP = t[3+INTERTASKDIM];

    double (*M)[2][Npadded][Npadded][Npadded] = (double (*)[2][Npadded][Npadded][Npadded])(c.wA);
    int64_t t4, t5, t6, t7, t8, lbv, ubv;
    for (t4=max(max(max(0,ceild(t1+t2-124,125)),ceild(16*t2-pN-998,1000)),ceild(16*t3-pN-998,1000));t4<=min(min(min(min(floord(8*t1+pN+7,500),floord(pT+pN-2,1000)),floord(16*t2+pN+14,1000)),floord(16*t3+pN+14,1000)),floord(8*t1+8*t2+pN+15,1000));t4++) {
      for (t5=max(max(max(max(max(0,8*t1+8*t2),16*t1+1),16*t2-pN),16*t3-pN),1000*t4-pN);t5<=min(min(min(min(min(pT-2,16*t2+14),16*t3+14),1000*t4+998),8*t1+8*t2+15),16*t1+pN+15);t5++) {
        if (t5%2 == 0) {
          for (t6=max(max(16*t2,t5+1),-16*t1+2*t5-15);t6<=min(min(-16*t1+2*t5,16*t2+15),t5+pN);t6++) {
            for (t7=max(16*t3,t5+1);t7<=min(16*t3+15,t5+pN);t7++) {
              lbv=max(1000*t4,t5+1);
              ubv=min(1000*t4+999,t5+pN);
#ifdef DEBUG_NUMA_MEM
              Threading::dumpMemInfo(std::cerr, &M[0][0][-t5+t6][-t5+t7][-t5+lbv]);
              Threading::dumpMemInfo(std::cerr, &M[0][1][-t5+t6][-t5+t7][-t5+lbv]);
#endif
#pragma vector always
#pragma ivdep
              for (t8=lbv;t8<=ubv;t8++) {
                M[0][1][-t5+t6][-t5+t7][-t5+t8]=0.125*(M[0][0][-t5+t6+1][-t5+t7][-t5+t8]-2.0*M[0][0][-t5+t6][-t5+t7][-t5+t8]+M[0][0][-t5+t6-1][-t5+t7][-t5+t8])+0.125*(M[0][0][-t5+t6][-t5+t7+1][-t5+t8]-2.0*M[0][0][-t5+t6][-t5+t7][-t5+t8]+M[0][0][-t5+t6][-t5+t7-1][-t5+t8])+0.125*(M[0][0][-t5+t6][-t5+t7][-t5+t8-1]-2.0*M[0][0][-t5+t6][-t5+t7][-t5+t8]+M[0][0][-t5+t6][-t5+t7][-t5+t8+1])+M[0][0][-t5+t6][-t5+t7][-t5+t8];;
              }
            }
          }
        }else{
          for (t6=max(max(16*t2,t5+1),-16*t1+2*t5-15);t6<=min(min(-16*t1+2*t5,16*t2+15),t5+pN);t6++) {
            for (t7=max(16*t3,t5+1);t7<=min(16*t3+15,t5+pN);t7++) {
              lbv=max(1000*t4,t5+1);
              ubv=min(1000*t4+999,t5+pN);
#ifdef DEBUG_NUMA_MEM
              Threading::dumpMemInfo(std::cerr, &M[0][0][-t5+t6][-t5+t7][-t5+lbv]);
              Threading::dumpMemInfo(std::cerr, &M[0][1][-t5+t6][-t5+t7][-t5+lbv]);
#endif

#pragma ivdep
#pragma vector always
              for (t8=lbv;t8<=ubv;t8++) {
                M[0][0][-t5+t6][-t5+t7][-t5+t8]=0.125*(M[0][1][-t5+t6+1][-t5+t7][-t5+t8]-2.0*M[0][1][-t5+t6][-t5+t7][-t5+t8]+M[0][1][-t5+t6-1][-t5+t7][-t5+t8])+0.125*(M[0][1][-t5+t6][-t5+t7+1][-t5+t8]-2.0*M[0][1][-t5+t6][-t5+t7][-t5+t8]+M[0][1][-t5+t6][-t5+t7-1][-t5+t8])+0.125*(M[0][1][-t5+t6][-t5+t7][-t5+t8-1]-2.0*M[0][1][-t5+t6][-t5+t7][-t5+t8]+M[0][1][-t5+t6][-t5+t7][-t5+t8+1])+M[0][1][-t5+t6][-t5+t7][-t5+t8];;
              }
            }
          }
        }
      }
    }

    // Make sure tag_elem_t can hold all values, upcasted to int64_t to avoid
    // sign/zero extends and control flow overhead in the loops.
    TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> 
      t(static_cast<tag_elem_t>(t1),  static_cast<tag_elem_t>(t2),  
        static_cast<tag_elem_t>(t3),  static_cast<tag_elem_t>(pT),  
        static_cast<tag_elem_t>(pM),  static_cast<tag_elem_t>(pN),  
        static_cast<tag_elem_t>(pP));
    auto &items = c.getItemCollection(t);
    items.put(t, nullptr);
#if (VERBOSE >= 11)
    std::cerr << "Putting tag: " << t << "\n";
#endif
  }

#if (VERBOSE >= 11)
  std::cerr << "Successfully done with " << t << "\n\n";
#endif

  return CnC::CNC_Success;
}


template <int TASK_ID, int NPARAMS, int TASKTAGDIM, int INTERTASKDIM> 
void CNCTask <TASK_ID, NPARAMS, TASKTAGDIM, INTERTASKDIM>::operator()() {
  CNCTask_context c(arrayPtr);

#if (VERBOSE >= 1)
  CnC::debug::collect_scheduler_statistics( c );
#endif

#ifdef DEBUG_CNC
  CnC::debug::trace( c.m_tags );
  CnC::debug::trace( c.items );
#endif // DEBUG_CNC 

  tbb::tick_count tick = tbb::tick_count::now();
  // std::chrono::time_point<std::chrono::system_clock> tick = 
  //   std::chrono::system_clock::now();

#ifndef ISOLATE_SEQ_OVERHEADS
  TaskTag<0, TASKTAGDIM, INTERTASKDIM> t
    (toArray(makeIdenticalTuple<tag_elem_t, TASKTAGDIM>::make(ZERO_TASK_INDEX)));
  c.getItemCollection(t).put(t, arrayPtr);
#if (VERBOSE >= 20)
  std::cerr << "Putting tag: " << t << "\n";
#endif
#ifdef MEASURE_OVERHEADS
  GlobalTimers::init();
#endif
#endif

  const int64_t pT = params[0];
  const int64_t pM = params[1];
  const int64_t pN = params[2];
  const int64_t pP = params[3];
  { // Auto-generated polyhedral loops
    int64_t t1, t2, t3;
    for (t1=ceild(-pN-15,16);t1<=floord(pT-3,16);t1++) {
      for (t2=max(t1,-t1-1);
           t2<=min(min(floord(-8*t1+pT-2,8),floord(8*t1+pN+7,8)),floord(pT+pN-2,16));t2++) {
        for (t3=max(max(0,ceild(t1+t2-1,2)),ceild(16*t2-pN-14,16));
             t3<=min(min(floord(pT+pN-2,16),floord(16*t2+pN+14,16)),floord(8*t1+8*t2+pN+15,16));
             t3++) {

          // TaskTag<TASK_ID, TASKTAGDIM, INTERTASKDIM> 
          //   t(static_cast<tag_elem_t>(t1),  static_cast<tag_elem_t>(t2),  
          //     static_cast<tag_elem_t>(t3),  static_cast<tag_elem_t>(pT),  
          //     static_cast<tag_elem_t>(pM),  static_cast<tag_elem_t>(pN),  
          //     static_cast<tag_elem_t>(pP));
          // c.m_tags.put(t);

          // #if (VERBOSE >= 20)                                  
          //           std::cerr << "Putting tag: " << t << "\n";         
          // #endif   

          {
#define __TASK_TAG_INIT__
#include "AutoGenSupport.hpp"
            RSTREAM_TASK_TAG_INIT(0, TASKTAGDIM, INTERTASKDIM, 
                                  (tag_elem_t)t1, (tag_elem_t)t2, (tag_elem_t)t3,
                                  (tag_elem_t)pT, (tag_elem_t)pM, (tag_elem_t)pN,
                                  (tag_elem_t)pP);
#undef __TASK_TAG_INIT__
          }
                                          
        }
      }
    }
  }



#ifdef ISOLATE_SEQ_OVERHEADS
  printf("Pure sequential startup cost : %g sec\n", 
         (tbb::tick_count::now()-tick).seconds());
  // printf("Pure sequential startup cost : %g sec\n", 
  //        ((std::chrono::duration_cast<std::chrono::microseconds>
  //          (std::chrono::system_clock::now() - tick).count()) / 1000000.0));
  printf("Total time will not include this cost!\n");
  sleep(1);
#ifdef MEASURE_OVERHEADS
  GlobalTimers::init();
#endif
  tick = tbb::tick_count::now();
  // tick = std::chrono::system_clock::now();
  c.getItemCollection(t).put(TaskTag<0, TASKTAGDIM, INTERTASKDIM>
              (toArray(makeIdenticalTuple<tag_elem_t, TASKTAGDIM>::
                       make(ZERO_TASK_INDEX))), arrayPtr);
#endif

  // Wait for all steps to finish
  c.wait();

  printf("Total time for init and parallel execution of %ld iterations of a " \
         "stencil of size %ld x %ld x %ld: %g sec\n", 
         pT, pN, pN, pN, (tbb::tick_count::now()-tick).seconds());
  // printf("Total time for init and parallel execution of %ld iterations of a " 
  //        "CNCTask of size %ld x %ld x %ld: %g sec\n", pT, pN, pN, pN, 
  //        ((std::chrono::duration_cast<std::chrono::microseconds>
  //          (std::chrono::system_clock::now() - tick).count()) / 1000000.0));
}

#include "explicitInstantiations.cpp.include"
